{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pandas 使用笔记\n",
    "\n",
    "Python 中处理二维表数据的核心， Python 中的 Excel\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "\n",
    "import numpy as np # 当然少不了和 numpy 搭配使用\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 基本 io"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# csv 读取中的常见问题\n",
    "# windows 下的 csv 文件可能是 gb2312 编码，但读取时指定的 encoding 却是 gbk 或者是 gb18030，要多尝试\n",
    "# 文件路径中如果包含中文，engine 则要指定 python，这是因为 read_csv 方法底层实现是 C 的库\n",
    "df = pd.read_csv('filepath', encoding='gbk', engine='python')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 纵向合并某一文件夹内所有的 Excel 文件\n",
    "\n",
    "经典任务，实现方法有很多，但就是还没有一个统一简洁的官方方法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def MERGE(workdir):\n",
    "    # 获取路径内全部文件，这里假设文件夹内有且只有目标的excel文件。\n",
    "    # 参杂其他东西也是有办法的，但这里先不涉及那么多\n",
    "    filenames = os.listdir(workdir)\n",
    "    filepath = []\n",
    "    for i in range(0,len(filenames)):\n",
    "        filepath.append(os.path.join(workdir,filenames[i]))\n",
    "        \n",
    "    # 遍历获取所有 excel 内容\n",
    "    content = []\n",
    "    \n",
    "    for file in filepath:\n",
    "        # pandas 读取 excel\n",
    "        # 根据需要可以增加参数:\n",
    "        #   sheet_name 指定工作簿 \n",
    "        #   skipfooter 忽略尾部 \n",
    "        #   dtype 指定列变量类型，详见文档\n",
    "        fileread = pd.read_excel(file)\n",
    "\n",
    "        # 全部加入 content 列表中\n",
    "        content.append(fileread)\n",
    "    \n",
    "    # 将 content 列表中的内容合并为 DataFrame\n",
    "    result = pd.concat(content) # concat 默认为纵向合并\n",
    "    \n",
    "    return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 横向合并\n",
    "\n",
    "同样有多种方法，可以实现类似 sql 的各种 join"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge方法，还有更多的合并参数可以选择，详见文档\n",
    "df3 = df1.merge(df2, how='outer', on='column_name') # 指定合并根据列\n",
    "df3 = df1.merge(df2, how='outer', left_index=True, right_index=True) # 根据两侧索引合并\n",
    "\n",
    "# Join方法，常用于分割列后的返回合并\n",
    "split = df['column_name'].str.split(expand=True) # 默认按空格分割，且展开列\n",
    "df = df.join(split)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 更多具体操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = MERGE('filepath')\n",
    "\n",
    "# 重设 pandas 默认索引，常用于合并多个 dataframe 之后\n",
    "df.reset_index(drop=True, inplace=True)\n",
    "\n",
    "# 使用某一列作为索引\n",
    "df.set_index([''], inplace=True)\n",
    "\n",
    "# 输出，可选参数众多，有需要时详见 IO 文档\n",
    "df.to_excel('filename', sheet_name='', index=True, index_label='', merge_cells=False) # 分割合并单元格\n",
    "df.to_csv('filename')\n",
    "\n",
    "# 去除行/列\n",
    "df.drop(columns=['column_name'], inplace=True) # 关键在于 inplace，此参数默认为 false，只会生成一个新的对象\n",
    "df.drop(['column_name'], axis=0, inplace=True) # 效果同上，axis=0 即列\n",
    "\n",
    "# 赋值新建列\n",
    "df['new_column_name'] = 'new_value'\n",
    "df['index1'] = list(range(1, len(allmerge['index'])+1)) # 给一个list，新建一个从1开始的索引\n",
    "\n",
    "# 按某列取值筛选行（类似 excel 的筛选），可组合条件使用\n",
    "# 注意后面的负号，可以对布尔值取补集\n",
    "df[(df['column_name1']=='value1' & df['column_name2']=='value2') | df['column_name3'=='value3']]\n",
    "df = df[df['column_name']!='value' & -df['column_name'].str.contains('str1') & -df['column_name'].str.match('str2')]\n",
    "\n",
    "# 也可以使用 loc，下面语句筛选出了 number > 0 的 extract 列\n",
    "df.loc[df['number'] > 0, ['extract']]\n",
    "\n",
    "# 给定规则替换值，至少有 map 和 replace 两种方法\n",
    "# map 方法还可以使用函数，更名副其实。详见文档\n",
    "namemap = {\n",
    "'被替换值':'替换值',\n",
    "'中华人民共和国':'中国',\n",
    "}\n",
    "df['column_name'] = df['column_name'].map(namemap) # map 方法没有 inplace 参数，只能重新赋值\n",
    "\n",
    "# replace 方法，这里可以配合正则使用\n",
    "pattern1 = r'.*身份证$'\n",
    "pattern2 = r'.*签证$'\n",
    "# regex 参数用于启动正则模式\n",
    "df['column_name'].replace( [pattern1, pattern2], ['身份证', '签证'], regex=True, inplace=True)\n",
    "\n",
    "# 处理文本两侧可能有的多余空格\n",
    "df['column_name'] = df['column_name'].str.lstrip() # 左去除\n",
    "df['column_name'] = df['column_name'].str.rstrip() # 右去除\n",
    "\n",
    "# 处理 csv 中常见的引号文本标记\n",
    "df['column_name'] = df['column_name'].str.strip('\\'')\n",
    "\n",
    "# 列改名\n",
    "df.rename(columns = {'old_name':'new_name'}, inplace=True)\n",
    "\n",
    "# 删除缺失数据\n",
    "# axis 取 0 即逐行检查各列的缺失情况；how 取 any 即任意一列有缺失都会删除整行\n",
    "df.dropna(axis=0, how='any')\n",
    "\n",
    "# 检查空值\n",
    "pd.isnull(object)\n",
    "\n",
    "# 缺失值 nan 需要用 numpy 处理\n",
    "df.replace(to_replace=np.nan, value=0, inplace = True)\n",
    "\n",
    "# 对列进行函数操作\n",
    "df['col'].apply(lambda x:2 * x) # 将 col 列中所有值乘上 2\n",
    "\n",
    "# 数据排序\n",
    "df.sort_values(ascending=False)\n",
    "\n",
    "# 索引筛选，假设为时间序列数据。\n",
    "# 这一部分方法众多，特别是涉及到多层行/列结构的时候，需要时精读文档\n",
    "df = df.loc['2018-02-08':]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 日期处理\n",
    "\n",
    "日期列读入 pandas 后往往只是一般的 object 类型，需要先转换成 datetime 类型方便后续使用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# Datetime 日期格式转换。\n",
    "df['new_datetime'] = pd.to_datetime(df['old_datetime'])\n",
    "# 一次转换多列\n",
    "df[['time1', 'time2']] = df[['time1', 'time2']].apply(pd.to_datetime)\n",
    "\n",
    "# Datetime 日期拆解\n",
    "df['date'] = df['new_datetime'].dt.date\n",
    "df['time'] = df['new_datetime'].dt.time\n",
    "df['secs'] = df['datetime1'] - df['datetime2']\n",
    "df['secs'] = df['secs'].dt.total_seconds()\n",
    "df[['secs1','secs2']] = df[['secs1','secs2']].astype('timedelta64[h]') # 根据需要调整 timedelta 类型\n",
    "\n",
    "# Datetime 一般日期列筛选切片（非索引）\n",
    "# pandas 自身主要的时间序列处理方法都需要将时间列作为索引，限制比较多\n",
    "# 日期切片，日期的文本字符串可以用'-'或者'/'分割\n",
    "# df = df[df['datetime'].isin(pd.date_range('2000-01-01','2000-02-01'))]\n",
    "\n",
    "# 注意：datetime 格式列不能直接使用上面的isin(pd.date_range())方法，只能用下面的比较\n",
    "\n",
    "# 也可以直接比较字符串日期，但有可能出现类型问题。要用 to_datetime 转换\n",
    "df = df[df['datetime'] >= '2000-01-01']\n",
    "df = df[df['datetime'] >= pd.to_datetime('2000-01-01')]\n",
    "# 注意：传入的字符串默认为 00:00，也就是说日期左闭右开\n",
    "# 例如，需要 1 月份的日期时\n",
    "df = df[(df['datetime'] >= pd.to_datetime('2000-01-01')) & (df['datetime'] < pd.to_datetime('2000-02-01'))]  # 正确\n",
    "df = df[(df['datetime'] >= pd.to_datetime('2000-01-01')) & (df['datetime'] <= pd.to_datetime('2000-01-31'))]  # 错误，会丢掉 1 月 31 日的数据\n",
    "\n",
    "# 索引列为 datetime 的切片方式\n",
    "# 日内小时切片\n",
    "df = df.between_time('23:00','00:00')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据透视表\n",
    "\n",
    "类似 Excel 的效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pivot 数据透视表\n",
    "\n",
    "df_pivot = df.pivot_table(values=['value_column_name'], \n",
    "                          index=['row_column_name'], \n",
    "                          columns=['column_column_name', \n",
    "                                   aggfunc='pd.Series.nunique', \n",
    "                                   margins=True)\n",
    "# 参数：\n",
    "#   Pandas  | Excel\n",
    "#   values  |  值\n",
    "#   index   |  行\n",
    "#   columns |  列\n",
    "#   aggfunc | 汇总方法\n",
    "#  margins-是否显示加总行/列\n",
    "\n",
    "# pivot 表 stack\n",
    "df_pivot = df_pivot.stack().to_frame() # 小心 stack()方法生成的是个 series 对象，还需要进一步转化为 dataframe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 描述性统计"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# 基本的数据集特征信息\n",
    "df.info()\n",
    "\n",
    "# 基本的数据集统计信息\n",
    "df.describe()\n",
    "\n",
    "# 所有列的情况\n",
    "df.columns\n",
    "df.dtypes\n",
    "\n",
    "# 某列的全部可能取值\n",
    "df['col'].unique()\n",
    "\n",
    "# 某列各可能取值的频次统计\n",
    "df['col'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 分组处理\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "inputHidden": false,
    "outputHidden": false
   },
   "outputs": [],
   "source": [
    "# 分组排名（按数值）：可以处理同分同排名，排名百分比等问题\n",
    "df.groupby.rank\n",
    "\n",
    "# 分组排序（按位置）\n",
    "df.groupby.cumcount\n",
    "\n",
    "# 分组描述性统计\n",
    "df.groupby.agg(['mean', 'std'])\n",
    "\n",
    "# 分组数据应用某项变换\n",
    "def zscore(x):\n",
    "    return (x - x.mean()) / x.std()\n",
    "df.groupby.transform(zscore)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 管道"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dtype_mapping = {'a':np.float32, 'b':np.uint8, 'c':np.float64, 'd':np.int64, 'e':str}\n",
    "df_cleaned = (df\n",
    "  .pipe(pd.DataFrame.sort_index, ascending=False)    #按索引排序\n",
    "  .pipe(pd.DataFrame.fillna,value=0, method='ffill') #缺失值处理\n",
    "  .pipe(pd.DataFrame.astype, dtype_mapping)          #数据类型变换\n",
    "  .pipe(pd.DataFrame.clip, lower= -0.05, upper=0.05) #极端值处理\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 类 SQL 查询"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#用query\n",
    "df.query(\"(a>0 and b<0.05) or c>b\")\n",
    "#普通方法\n",
    "df.loc[((df['a']>0) & (df['b']<0.05))| (df['c']>df['b'])]\n",
    "\n",
    "# 注意，query的字符串中也可以用 @ 符号引用当前命名空间内的变量"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 多重索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 可以使用 pd.IndexSlice 来做更自然的语法\n",
    "idx = pd.IndexSlice\n",
    "df.loc[idx[:,'level2_i'], 'column'] # level1全选"
   ]
  }
 ],
 "metadata": {
  "kernel_info": {
   "name": "python3"
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  },
  "nteract": {
   "version": "0.14.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
